# -*- coding: utf-8 -*-
"""
Code that matches sections within the same congress
Created on Fri Apr 21 12:35:34 2023

@author: jgelman
"""

import difflib
import pandas as pd
import numpy as np
import csv
import time
import sys, os
import textdistance
from nltk.tokenize import word_tokenize  

# ---- Configuration -------------------------------------------------------
# Congress being processed. The paths and file names below hard-code the
# same number and must be changed together with it.
cong = [111]

# ---- Load all cleaned sections -------------------------------------------
secs_all = pd.read_csv(
    '/replication/cleaned_sections/111/111_sections_noboilerplate.csv'
)

# Drop the leftover index column written by an earlier to_csv call.
secs_all = secs_all.drop(columns=['Unnamed: 0'])

# Discard bills whose id contains "enr" (presumably enrolled versions -- confirm).
secs_all = secs_all[~secs_all['bill'].str.contains("enr")]
secs_all['date'] = pd.to_datetime(secs_all['date'])

# Bills whose id matches the regex "ih|is" (presumably introduced in
# House / introduced in Senate -- confirm).
secs_intro = secs_all[secs_all['bill'].str.contains("ih|is")]

# ---- Remove enacted sections from the introduced sections ----------------
os.chdir('/replication/enacted_sections')
enacted = pd.read_csv('secs111_enacted.csv')

# One row per distinct sec1, with the per-column non-null counts; only the
# section id and the count of 'match' are kept.
enacted_secs = (
    enacted.groupby(enacted['sec1']).count()
    .assign(section=lambda counts: counts.index)
    [['section', 'match']]
    .rename(columns={"match": "enacted_match"})
)

# Left-join, then keep only introduced sections with no enacted counterpart.
# The merge appends 'enacted_match' after the existing columns, so the
# positional layout of the leading columns is unchanged.
unenacted = secs_intro.merge(enacted_secs, how='left', on='section')
unenacted = unenacted[unenacted['enacted_match'].isnull()]

# ---- Prepare the output file ---------------------------------------------
os.chdir('/replication/same_congress')

header = [
    'bill1', 'sec1', 'raw_txt1', 'clean_txt1', 'date1',
    'bill2', 'sec2', 'raw_txt2', 'clean_txt2', 'date2',
    'txt1_ad_5', 'txt2_ad_5', 'scope_10',
    'num_blocks', 'perblock_txt1', 'perblock_txt2',
    "first_shared_keys_2", "first_txt1_ad_2", "first_txt2_ad_2",
    'match', 'x', 'y',
]

# Mode 'w' truncates any previous output; pairwise_dice appends rows later.
with open('samecong_matches111.csv', mode='w', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    writer.writerow(header)

# ---- Plain-list views used by pairwise_dice ------------------------------
# pairwise_dice indexes these rows by position, so column order matters.
intro_sectlist = unenacted.values.tolist()
all_sectlist = secs_all.values.tolist()
counter1, counter2 = len(intro_sectlist), len(all_sectlist)

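#Function that uses the decision-tree classifier to match introduced sections to later sections.
#Run decisiontree_functions.py and decision_tree.py beforehand: pairwise_dice relies on names
#that are not defined in this file (dice, gen_hash, gen_hash_first100, blocks, clf_model_minor).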

# Texts this long or longer are written to the output CSV as '' instead
# (presumably to keep cells within spreadsheet limits -- confirm).
_MAX_CELL_CHARS = 30000


def _text_for_csv(text):
    """Return *text* for an output cell, or '' when it is missing or too long."""
    # A missing text shows up as a float (NaN), the same convention the
    # isinstance(..., float) checks in pairwise_dice rely on; len() would
    # raise TypeError on it.
    if isinstance(text, float):
        return ''
    return text if len(text) < _MAX_CELL_CHARS else ''


def pairwise_dice(num1, txts1, num2, txts2):
    """Compare introduced sections with later sections and log predicted matches.

    A pair (txts1[x], txts2[y]) is scored only when the second section is
    dated strictly after the first, both cleaned texts are present (not a
    float NaN), and ``dice`` of the two cleaned texts is greater than 0.5.
    For such pairs nine similarity features are computed and passed to
    ``clf_model_minor.predict``; pairs predicted as 1 are appended as one
    row to 'samecong_matches111.csv' in the current working directory.

    Rows are read by position: 0 = bill, 1 = section, 2 = raw text,
    3 = cleaned text, 5 = date.

    Relies on ``dice``, ``gen_hash``, ``gen_hash_first100``, ``blocks`` and
    ``clf_model_minor`` already existing as global names (see the note
    above this function about the scripts to run beforehand).

    Args:
        num1: Number of leading rows of ``txts1`` to process.
        txts1: Row lists for the introduced sections.
        num2: Number of leading rows of ``txts2`` to compare against.
        txts2: Row lists for the candidate sections.

    Returns:
        None. Matches are written to the CSV and progress is printed.
    """
    for x in range(num1):
        sect1 = txts1[x]
        clean1 = sect1[3]
        if isinstance(clean1, float):  # missing cleaned text
            continue

        print(x)  # progress indicator
        bill1 = sect1[0]
        date1 = sect1[5]

        for y in range(num2):
            sect2 = txts2[y]

            # Only look at sections that come after the introduced bill.
            if date1 >= sect2[5]:
                continue
            clean2 = sect2[3]
            if isinstance(clean2, float):  # missing cleaned text
                continue
            # Cheap pre-filter before the more expensive feature extraction.
            if dice(clean1, clean2) <= 0.5:
                continue

            bill2 = sect2[0]
            score5 = gen_hash(clean1, clean2, 5, bill1, bill2)
            score10 = gen_hash(clean1, clean2, 10, bill1, bill2)
            block1 = blocks(clean1, clean2)
            score_first2 = gen_hash_first100(clean1, clean2, 5, bill1, bill2)

            # Same order as the feature columns in the CSV header:
            # txt1_ad_5, txt2_ad_5, scope_10, num_blocks, perblock_txt1,
            # perblock_txt2, first_shared_keys_2, first_txt1_ad_2,
            # first_txt2_ad_2.
            features = [
                score5[3], score5[4], score10[5],
                block1[1], block1[4], block1[5],
                score_first2[2], score_first2[3], score_first2[4],
            ]
            y_full_int = clf_model_minor.predict(np.asarray([features]))[0]

            if y_full_int == 1:
                new_sect = [
                    bill1, sect1[1],
                    _text_for_csv(sect1[2]), _text_for_csv(clean1), date1,
                    bill2, sect2[1],
                    _text_for_csv(sect2[2]), _text_for_csv(clean2), sect2[5],
                ] + features + [y_full_int, x, y]
                print(x, y)
                # Re-opened in append mode for every match so each row is
                # on disk even if the long-running job is interrupted.
                with open('samecong_matches111.csv', 'a', encoding='UTF8', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow(new_sect)
          
# Run the full comparison and print the CPU time it consumed.
# time.process_time() measures process CPU time, so time spent sleeping
# is not counted.
start = time.process_time()
# pairwise_dice has no return statement, so `comparisons` is None; the
# matches themselves are in the output CSV.
comparisons = pairwise_dice(
    counter1,
    intro_sectlist,
    counter2,
    all_sectlist,
)
print(time.process_time() - start)